# -*- coding: utf-8 -*-
# Python 3.8.3
"""
弗利尔美术馆中国藏品爬虫
https://www.freersackler.si.edu/collections/
14225 results for China
"""
import requests
import json
import re   # regular expressions
import os
import time
from bs4 import BeautifulSoup # HTML parsing (not used in the code visible here)
from fake_useragent import UserAgent    # random User-Agent request headers
import csv
import threading


class Freersackler_China:
    """Scraper for China-related objects in the Freer|Sackler (Smithsonian
    National Museum of Asian Art) collection search API.

    Output files (all opened in append mode, in the working directory):
      - ``freersackler.json``: one ``"<n>": {...},`` fragment per object
      - ``img_url.txt``: one image-delivery URL per line
      - ``error.txt``: URLs / page numbers that failed after 3 attempts
    """

    # The search API returns at most this many posts per result page.
    PAGE_SIZE = 21

    # meta fields copied via their "value" entry, in output order
    # (before the "on_view" flag, which lives under "boolean" instead).
    _META_FIELDS_HEAD = (
        "object_name",
        "object_type",      # gallery section
        "geography",        # place of origin
        "credit",           # donation / credit line
        "reign",
        "dynasty",
        "period",
        "time_period",
        "dated",
        "date_begin",
        "date_end",
        "dimensions",
        "medium",
        "inscription",
        "previous_owner",
        "label",            # narrative label text
        "makers",
        "markings",
        "notes",
    )
    # meta fields emitted after "on_view".
    _META_FIELDS_TAIL = (
        "on_view_location",
        "provenance",       # object history
        "bibliography",     # references
        "remarks",
        "si_usage_statement",
    )

    def __init__(self):
        # Search endpoint (the public pages are rendered from this JSON API).
        self.url = 'https://asia.si.edu/wp-json/fsep/v1/search'
        # BUG FIX: str(UserAgent(...)) only produced the object's repr, not a
        # User-Agent string; ``.random`` returns an actual UA header value.
        self.headers = {'user-agent': UserAgent(path="fake_useragent.json").random}

    @staticmethod
    def _meta_first(meta, key, field="value"):
        """Return ``meta[key][0][field]``, or None when the key, the list
        entry, or the field is missing (the API omits fields on some
        objects; the original code raised KeyError in that case)."""
        entries = meta.get(key)
        if entries:
            return entries[0].get(field)
        return None

    def get_details(self, page, posts):
        """Extract every object on one result page and append the records
        to ``freersackler.json``.

        page: 1-based search-result page number (used to number records).
        posts: list of post dicts from the search API response.
        """
        # Iterate over what the API actually returned instead of assuming a
        # fixed 21 entries — the final page can be shorter (was IndexError).
        for j, post in enumerate(posts):
            record = {}
            record["url"] = post["permalink"]      # detail-page link
            print(record["url"])
            record["type"] = post["post_type"]
            record["title"] = post["post_title"]   # object title

            meta = post["meta"]
            details = {}
            for key in self._META_FIELDS_HEAD:
                details[key] = self._meta_first(meta, key)
            # "on_view" stores its payload under "boolean", not "value".
            details["on_view"] = self._meta_first(meta, "on_view", field="boolean")
            for key in self._META_FIELDS_TAIL:
                details[key] = self._meta_first(meta, key)
            # BUG FIX: output key was the typo "mata".
            record["meta"] = details

            # Fetch the detail page to harvest image URLs; retry up to 3
            # times on network errors, then log the URL and move on.
            for attempt in range(1, 4):
                try:
                    resp = requests.get(url=record["url"], headers=self.headers, timeout=60)
                    resp.encoding = "utf-8"
                    # Image ids are embedded as data-file-name="..." attributes.
                    codes = re.findall(r'data-file-name="(.*?)"', resp.text)
                    img_urls = {}
                    # Persist the URLs now; downloading the images is done by
                    # a separate script (inline downloads were too slow).
                    with open("img_url.txt", encoding="utf-8", mode="a") as file:
                        for count, code in enumerate(codes):
                            url = 'https://ids.si.edu/ids/deliveryService?id={}'.format(code)
                            img_urls[str(count)] = url
                            file.write(url + "\n")
                    record["img_url"] = img_urls
                    break
                except requests.exceptions.RequestException:
                    print("getdetail error {}times".format(attempt))
                    if attempt == 3:
                        # Give up after the third failure; remember the URL.
                        self.error_log(record["url"] + "\n")

            with open("freersackler.json", encoding="utf-8", mode="a") as file:
                # Records are numbered globally assuming PAGE_SIZE per page.
                file.write("\"" + str((page - 1) * 21 + j + 1) + "\":" + json.dumps(record) + ",")

    def get_objects(self, start, end):
        """Crawl search-result pages ``start`` (inclusive) through ``end``
        (exclusive), retrying each page up to 3 times before logging the
        page number to error.txt."""
        for page in range(start, end):
            print("第{}页".format(str(page)))
            data = {"query_vars":{"s":"China","terms":"{\"meta__post_type[]\":\"Objects\"}","paged":page,"sort":"score","view":"gallery"}}
            for attempt in range(1, 4):
                try:
                    # The site embeds its results as JSON, so we POST JSON
                    # parameters and decode a JSON response.
                    reqs = requests.post(url=self.url, json=data, headers=self.headers, timeout=60).json()
                    # Hand the page's posts over for per-object extraction.
                    self.get_details(page, reqs["posts"])
                    break
                except requests.exceptions.RequestException:
                    print("getlink error {}times".format(str(attempt)))
                    if attempt == 3:
                        self.error_log("搜索页面错误:" + str(page) + "\n")

    def error_log(self, link):
        """Append one error line to error.txt."""
        with open("error.txt", encoding="utf-8", mode="a") as file:
            file.write(link)

    def run(self):
        """Crawl 10 batches of 70 result pages each (pages 0-699).

        BUG FIX: the original passed ``target=self.get_objects(...)`` which
        *called* the method in the main thread and handed ``Thread`` its
        ``None`` return value. We now pass the callable plus ``args``.
        ``join()`` immediately after ``start()`` keeps the batches
        sequential, matching the original's effective behaviour and avoiding
        concurrent appends to the shared output files.
        """
        batch = 70
        for i in range(10):
            thread = threading.Thread(target=self.get_objects,
                                      args=(i * batch, (i + 1) * batch))
            thread.start()
            thread.join()


if __name__ == '__main__':
    # Entry point: build the scraper and start crawling.
    scraper = Freersackler_China()
    scraper.run()
